%%javascript
IPython.notebook.clear_all_output();
Team members: Ke Fan, Khalil Jalal Majed
The three data files are the World Happiness Reports of 2015, 2016 and 2017. Each report surveys the state of happiness in most countries. The data files are located under the directory ./world-happiness/.
The size of happiness data 2015 is 158*12, and its columns are ['Country', 'Region', 'Happiness Rank', 'Happiness Score','Standard Error', 'Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)', 'Freedom', 'Trust (Government Corruption)','Generosity', 'Dystopia Residual'] .
The size of happiness data 2016 is 157*13, and its columns are ['Country', 'Region', 'Happiness Rank', 'Happiness Score','Lower Confidence Interval', 'Upper Confidence Interval','Economy (GDP per Capita)', 'Family', 'Health (Life Expectancy)','Freedom', 'Trust (Government Corruption)', 'Generosity','Dystopia Residual'].
The size of happiness data 2017 is 155*12, and its columns are ['Country', 'Happiness.Rank', 'Happiness.Score', 'Whisker.high','Whisker.low', 'Economy..GDP.per.Capita.', 'Family','Health..Life.Expectancy.', 'Freedom', 'Generosity','Trust..Government.Corruption.', 'Dystopia.Residual'].
Here are the steps of data cleaning:
The final columns are ['Country', 'Happiness_Rank', 'Happiness_Score','Economy', 'Family', 'Health', 'Freedom', 'Trust','Generosity', 'Dystopia_Residual'].
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
# Load the three yearly reports and inspect their shapes and columns.
data_2015 = pd.read_csv("./world-happiness/2015.csv")
data_2016 = pd.read_csv("./world-happiness/2016.csv")
data_2017 = pd.read_csv("./world-happiness/2017.csv")

# Report the dimensions of each yearly table.
for _year, _df in (("2015", data_2015), ("2016", data_2016), ("2017", data_2017)):
    print("happiness data " + _year + ": ", _df.shape)
print("\n")

# Show the raw column names so the schema differences are visible.
for _year, _df in (("2015", data_2015), ("2016", data_2016), ("2017", data_2017)):
    print("columns of happiness data " + _year + ": ", _df.columns)
# Harmonize the three yearly tables onto one common schema.
_COMMON_COLUMNS = ['Country', 'Happiness_Rank', 'Happiness_Score',
                   'Economy', 'Family', 'Health', 'Freedom', 'Trust',
                   'Generosity', 'Dystopia_Residual']

# Drop the columns that only some years carry.
data_2015 = data_2015.drop(['Region', 'Standard Error'], axis=1)
data_2016 = data_2016.drop(['Region', 'Lower Confidence Interval',
                            'Upper Confidence Interval'], axis=1)
data_2017 = data_2017.drop(['Whisker.high', 'Whisker.low'], axis=1)

# Rename positionally. The 2017 file lists Generosity before Trust,
# so it gets its own name list first and is then reordered.
data_2015.columns = _COMMON_COLUMNS
data_2016.columns = _COMMON_COLUMNS
data_2017.columns = ['Country', 'Happiness_Rank', 'Happiness_Score',
                     'Economy', 'Family', 'Health', 'Freedom',
                     'Generosity', 'Trust', 'Dystopia_Residual']
# Swap the two columns in the 2017 data to match the shared layout.
data_2017 = data_2017[_COMMON_COLUMNS]
# Data combination.
# ignore_index=True rebuilds one unique 0..N-1 index; without it each
# yearly frame keeps its own 0-based index, so `happiness` carries
# duplicate labels and later column inserts/alignments misbehave.
happiness = pd.concat([data_2015, data_2016, data_2017], ignore_index=True)
print("The size of the data: ", happiness.shape)
# Check if there is missing data.
happiness.isna().sum()
# Show several lines as an example.
happiness.head()
In this part, we show several graphics to exhibit detailed information about the data.
These graphics include:
...
# add some visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
from plotly.offline import iplot

# EDA: distribution of the score and every contributing factor.
happiness[['Happiness_Score', 'Economy', 'Family', 'Health', 'Freedom', 'Trust',
           'Generosity', 'Dystopia_Residual']].hist(figsize=(18, 12), bins=50, grid=True);

# Correlation heatmap. Select numeric columns explicitly: recent pandas
# raises when corr() meets the non-numeric 'Country' column (older
# pandas silently dropped it, so this is behavior-equivalent there).
plt.subplots(figsize=(10, 8))
sns.heatmap(happiness.select_dtypes('number').corr(), cmap='coolwarm', annot=True);
# World map of the Happiness Score: one choropleth trace plus a layout.
trace = dict(
    type='choropleth',
    locations=happiness['Country'],
    locationmode='country names',
    z=happiness['Happiness_Score'],
    text=happiness['Country'],
    colorbar={'title': 'Happiness score'},
)
map_layout = dict(
    title='Happiness Map',
    geo=dict(showframe=False, projection={'type': 'equirectangular'}),
)
choromap = go.Figure(data=[trace], layout=map_layout)
iplot(choromap)
In this part, we used several clustering algorithms to cluster the data.
These include:
After each clustering step, we also provide graphics to show the clustering results.
# Remove irrelevant columns: the country label and the rank (the rank
# is just the ordering of the score, so it carries no extra signal).
data = happiness.drop(["Country", "Happiness_Rank"], axis=1)

# The number of clusters was chosen by experiment:
# 2, 3, 4 and 5 clusters were tried; 3 performed best.
from sklearn.cluster import KMeans


def K_means(X, nclust, random_state=None):
    """Fit K-means with `nclust` clusters on X.

    Returns (labels, cluster_centers). `random_state` is optional and
    defaults to None, preserving the original (non-deterministic)
    seeding; pass an int for reproducible runs.
    """
    model = KMeans(nclust, random_state=random_state)
    model.fit(X)
    pred = model.predict(X)
    cens = model.cluster_centers_
    return (pred, cens)


predict, centers = K_means(data, 3)
kmeans = pd.DataFrame(predict)
# Append the cluster label as the last column of `happiness`.
happiness.insert((happiness.shape[1]), 'kmeans', kmeans)
# Economy vs. happiness score, coloured by the k-means cluster label.
# X axis is GDP per Capita, y axis is Happiness Score.
fig, ax = plt.subplots()
points = ax.scatter(happiness['Economy'], happiness['Happiness_Score'],
                    c=kmeans[0], s=50)
ax.set_title('K-Means Clustering')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Happiness Score')
plt.colorbar(points)
# World map of the k-means cluster assignment per country.
trace1 = dict(type='choropleth',
              locations=happiness['Country'],
              locationmode='country names',
              z=happiness['kmeans'],
              text=happiness['Country'],
              colorbar={'title': 'Clusters'})
layout1 = dict(title='Countries Clustering based on K-Means',
               geo=dict(showframe=False,
                        projection={'type': 'equirectangular'}))
map1 = go.Figure(data=[trace1], layout=layout1)
iplot(map1)
# Gaussian Mixture Modelling
from sklearn.mixture import GaussianMixture


def GMM(X, nclust):
    """Cluster X into `nclust` Gaussian components; return the labels.

    init_params='kmeans' seeds the component responsibilities from a
    k-means run before the EM iterations.
    """
    model = GaussianMixture(n_components=nclust, init_params='kmeans')
    model.fit(X)
    pred = model.predict(X)
    return (pred)


pred1 = GMM(data, 3)
gmm = pd.DataFrame(pred1)
# Append the GMM label as the last column of `happiness`.
happiness.insert((happiness.shape[1]), 'gmm', gmm)
# Plot the relationship between Economy and happiness score based on
# the clustering result of GMM.
# X is GDP per Capita, and y is Happiness Score.
fig = plt.figure()
ax = fig.add_subplot(111)
scatter = ax.scatter(happiness['Economy'], happiness['Happiness_Score'],
                     c=gmm[0], s=50)
ax.set_title('GMM Clustering')  # fixed typo: title previously read 'GGM'
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Happiness Score')
plt.colorbar(scatter)
# The countries clustering result of GMM (typo 'GGM' fixed in the title).
dmap2 = [dict(type='choropleth',
              locations=happiness['Country'],
              locationmode='country names',
              z=happiness['gmm'],
              text=happiness['Country'],
              colorbar={'title': 'Clusters'})]
layout2 = dict(title='Countries Clustering based on GMM',
               geo=dict(showframe=False,
                        projection={'type': 'equirectangular'}))
map2 = go.Figure(data=dmap2, layout=layout2)
iplot(map2)
# Agglomerative Clustering
# NOTE(review): the original comment claimed the cluster count is found
# automatically — it is not; AgglomerativeClustering defaults to
# n_clusters=2, which is what this call uses.
from sklearn.cluster import AgglomerativeClustering


def agglomerative(X):
    """Ward-linkage hierarchical clustering of X; returns the labels.

    Uses the default n_clusters=2 with euclidean distances.
    NOTE: 'affinity' was renamed to 'metric' in scikit-learn 1.2 —
    rename the keyword if running on a recent version.
    """
    model = AgglomerativeClustering(affinity='euclidean', linkage='ward')
    pred = model.fit_predict(X)
    return (pred)


pred2 = agglomerative(data)
agg = pd.DataFrame(pred2)
# Append the agglomerative label as the last column of `happiness`.
happiness.insert((happiness.shape[1]), 'agg', agg)
# Economy vs. happiness score, coloured by the agglomerative label.
# X axis is GDP per Capita, y axis is Happiness Score.
fig, ax = plt.subplots()
points = ax.scatter(happiness['Economy'], happiness['Happiness_Score'],
                    c=agg[0], s=50)
ax.set_title('Agglomerative Clustering')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Happiness Score')
plt.colorbar(points)
# World map of the agglomerative cluster assignment per country.
trace3 = dict(type='choropleth',
              locations=happiness['Country'],
              locationmode='country names',
              z=happiness['agg'],
              text=happiness['Country'],
              colorbar={'title': 'Clusters'})
layout3 = dict(title='Countries Clustering based on Agglomerative Clustering',
               geo=dict(showframe=False,
                        projection={'type': 'equirectangular'}))
map3 = go.Figure(data=[trace3], layout=layout3)
iplot(map3)
# Affinity Propagation
# Here the number of clusters genuinely IS determined by the algorithm.
from sklearn.cluster import AffinityPropagation


def affinity(X):
    """Run Affinity Propagation on X and return the cluster labels.

    damping=0.5 and max_iter=250 control convergence of the message
    passing; distances are euclidean.
    """
    model = AffinityPropagation(damping=0.5, max_iter=250, affinity='euclidean')
    model.fit(X)
    pred = model.predict(X)
    return (pred)


pred3 = affinity(data)
aff = pd.DataFrame(pred3)
# Append the affinity-propagation label as the last column.
happiness.insert((happiness.shape[1]), 'aff', aff)
# Economy vs. happiness score, coloured by the affinity-propagation label.
# X axis is GDP per Capita, y axis is Happiness Score.
fig, ax = plt.subplots()
points = ax.scatter(happiness['Economy'], happiness['Happiness_Score'],
                    c=aff[0], s=50)
ax.set_title('Affinity Propagation')
ax.set_xlabel('GDP per Capita')
ax.set_ylabel('Happiness Score')
plt.colorbar(points)
# World map of the affinity-propagation cluster assignment per country.
trace4 = dict(type='choropleth',
              locations=happiness['Country'],
              locationmode='country names',
              z=happiness['aff'],
              text=happiness['Country'],
              colorbar={'title': 'Clusters'})
layout4 = dict(title='Countries Clustering based on Affinity Propagation',
               geo=dict(showframe=False,
                        projection={'type': 'equirectangular'}))
map4 = go.Figure(data=[trace4], layout=layout4)
iplot(map4)
In this part, we used several regression methods to train on the data and predict happiness scores with the fitted models.
These include:
The evaluation metrics include mean squared error, mean absolute error, and the R² score.
In addition, we plot a comparison of the predicted values against the real values.
For the methods with many parameters, we used "GridSearchCV" to tune them automatically.
# Hold out 20% of the rows for testing (train:test = 8:2).
from sklearn.model_selection import train_test_split

y = data['Happiness_Score']
X = data.drop('Happiness_Score', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101)
# Linear-regression baseline, scored with MSE, MAE and the R^2 score.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

model1 = LinearRegression().fit(X_train, y_train)
print("The estimated intercept is", model1.intercept_)
print("The estimated coefficients are", model1.coef_)

# Score the held-out split.
pred1 = model1.predict(X_test)
mse_1 = mean_squared_error(y_test, pred1)
mae_1 = mean_absolute_error(y_test, pred1)
r2_1 = r2_score(y_test, pred1)
print("The value of mean squared error is", mse_1)
print("The value of mean absolute error is", mae_1)
print("The r2 score is", r2_1)
# Predicted vs. actual happiness score for the linear model.
import matplotlib.pyplot as plt

x = pred1
y = y_test
ax = plt.axes()
ax.scatter(x, y)
# Draw a y = x reference line. The original ax.plot(x, y) connected the
# unsorted scatter points, producing a meaningless zig-zag.
lims = [min(x.min(), y.min()), max(x.max(), y.max())]
ax.plot(lims, lims)
ax.set_xlabel('Predict Happiness score')
ax.set_ylabel('Actual Happiness score')
ax.axis('tight')
# Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures

# NOTE(review): degree 7 over ~8 features yields a very large, almost
# certainly overfit design matrix; kept to preserve the experiment.
poly = PolynomialFeatures(7)
X_poly = poly.fit_transform(X_train)
model2 = LinearRegression()
model2.fit(X_poly, y_train)
# Use transform (not fit_transform) on the test split: the expansion
# applied to test data must be the one fitted on the training data.
pred2 = model2.predict(poly.transform(X_test))
mse_2 = mean_squared_error(y_test, pred2)
mae_2 = mean_absolute_error(y_test, pred2)
r2_2 = r2_score(y_test, pred2)
print("The value of mean squared error is", mse_2)
print("The value of mean absolute error is", mae_2)
print("The r2 score is", r2_2)
# Predicted vs. actual happiness score for the polynomial model.
x_2 = pred2
y = y_test
ax = plt.axes()
ax.scatter(x_2, y)
# y = x reference line (connecting the unsorted points drew a zig-zag).
lims = [min(x_2.min(), y.min()), max(x_2.max(), y.max())]
ax.plot(lims, lims)
ax.set_xlabel('Predict Happiness score')
ax.set_ylabel('Actual Happiness score')
ax.axis('tight')
# Random Forest Regressor tuned with a grid search.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Fix the seed on the estimator itself. random_state is not a model
# hyper-parameter; tuning it in the grid (as the original did) just
# cherry-picks a lucky seed and inflates the reported score.
regr = RandomForestRegressor(max_depth=2, random_state=0)
hyperparameters = {'n_estimators': [20, 30, 40, 50],
                   # 'auto' was removed for regressors in sklearn 1.3;
                   # None (use all features) is its equivalent.
                   'max_features': [None, 'sqrt', 'log2'],
                   'max_depth': [None, 15, 10, 5],
                   }
clf = GridSearchCV(regr, hyperparameters)
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
mse_3 = mean_squared_error(y_test, pred)
mae_3 = mean_absolute_error(y_test, pred)
r2_3 = r2_score(y_test, pred)
print("The value of mean squared error is", mse_3)
print("The value of mean absolute error is", mae_3)
print("The r2 score is", r2_3)
# Predicted vs. actual happiness score for the random forest.
x_1 = pred
y = y_test
ax = plt.axes()
ax.scatter(x_1, y)
# y = x reference line (connecting the unsorted points drew a zig-zag).
lims = [min(x_1.min(), y.min()), max(x_1.max(), y.max())]
ax.plot(lims, lims)
ax.set_xlabel('Predict Happiness score')
ax.set_ylabel('Actual Happiness score')
ax.axis('tight')
# Support Vector Regression tuned over kernel and gamma.
from sklearn.svm import SVR
import warnings

warnings.filterwarnings("ignore")

svr = SVR(C=1.0)
parameters = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto']}
# GridSearchCV.fit returns the fitted search object itself.
clf2 = GridSearchCV(svr, parameters).fit(X_train, y_train)
pred3 = clf2.predict(X_test)
print("Best parameters set found on development set:")
print()
print(clf2.best_params_)
print()
mse_4 = mean_squared_error(y_test, pred3)
mae_4 = mean_absolute_error(y_test, pred3)
r2_4 = r2_score(y_test, pred3)
print("The value of mean squared error is", mse_4)
print("The value of mean absolute error is", mae_4)
print("The r2 score is", r2_4)
# Predicted vs. actual happiness score for the SVR model.
x_2 = pred3
y = y_test
ax = plt.axes()
ax.scatter(x_2, y)
# y = x reference line (connecting the unsorted points drew a zig-zag).
lims = [min(x_2.min(), y.min()), max(x_2.max(), y.max())]
ax.plot(lims, lims)
ax.set_xlabel('Predict Happiness score')
ax.set_ylabel('Actual Happiness score')
ax.axis('tight')
# Decision Tree Regressor tuned with a grid search.
from sklearn.tree import DecisionTreeRegressor

# Seed fixed on the estimator; random_state is not a model
# hyper-parameter and must not be tuned by the grid search.
regr_1 = DecisionTreeRegressor(max_depth=2, random_state=0)
# NOTE(review): 'mse'/'mae' were renamed to 'squared_error'/
# 'absolute_error' in scikit-learn 1.0 and removed in 1.2; 'auto' for
# max_features was removed in 1.3 — rename these if running on a
# recent version (kept as-is for the notebook's original sklearn).
parameters1 = {'criterion': ['mse', 'friedman_mse', 'mae'],
               'max_depth': [None, 20, 15, 10, 5],
               'max_features': ['auto', 'sqrt', 'log2'],
               }
clf3 = GridSearchCV(regr_1, parameters1)
clf3.fit(X_train, y_train)
pred4 = clf3.predict(X_test)
print("Best parameters set found on development set:")
print()
print(clf3.best_params_)
print()
mse_5 = mean_squared_error(y_test, pred4)
mae_5 = mean_absolute_error(y_test, pred4)
r2_5 = r2_score(y_test, pred4)
print("The value of mean squared error is", mse_5)
print("The value of mean absolute error is", mae_5)
print("The r2 score is", r2_5)
# Predicted vs. actual happiness score for the decision tree.
x_3 = pred4
y = y_test
ax = plt.axes()
ax.scatter(x_3, y)
# y = x reference line (connecting the unsorted points drew a zig-zag).
lims = [min(x_3.min(), y.min()), max(x_3.max(), y.max())]
ax.plot(lims, lims)
ax.set_xlabel('Predict Happiness score')
ax.set_ylabel('Actual Happiness score')
ax.axis('tight')
# Side-by-side comparison of the five regression models.
mean_squared_errors = np.array([mse_1, mse_2, mse_3, mse_4, mse_5])
mean_absolute_errors = np.array([mae_1, mae_2, mae_3, mae_4, mae_5])
r2_scores = np.array([r2_1, r2_2, r2_3, r2_4, r2_5])

fig, ax = plt.subplots()
index = np.arange(5)
bar_width = 0.25
ax.bar(index, mean_squared_errors, width=bar_width, label='mean_squared_errors')
ax.bar(index + bar_width, mean_absolute_errors, width=bar_width, label='mean_absolute_errors')
ax.bar(index + 2 * bar_width, r2_scores, width=bar_width, label='r2_scores')
ax.set_xlabel('Regression methods')
ax.set_ylabel('Scores')
ax.set_title('Comparison result of Regression methods ')
# Pin the ticks to the bar-group centres before labelling them.
# Calling set_xticklabels alone (as the original did) relies on
# whatever default ticks matplotlib happens to place.
ax.set_xticks(index + bar_width)
ax.set_xticklabels(('LR', 'PR', 'RFR', 'SVR', 'DTR'))
ax.legend()
plt.show()